clear all 
set more off 
set maxvar 15000 
set matsize 11000
clear matrix
set seed 123454321 

    * Number of repetitions
    local reps=100 

**------------------------**
**------------------------**

/* Check if each triplet has enough N in the census. "Enough"
   means at least the same N as in the surveys.
   If not enough, expanded the Census/1936 data. */
   
    forval d=1/7 { 

        noisily display "setting up data for 19`d'0"
        noisily display c(current_time)

        use "$Mydirectory1/3_Output/3b_3surveys_19`d'0survey.dta", clear 

        count 
        local count_1=`r(N)'
        levelsof triplet, local(triplet_list) 

        foreach num in `triplet_list'  { 
            
            count if triplet==`num' & census==0 
            di "N for triplet `num' if census==0, 19`d'0, is `r(N)'" 
            local num_in_survey=`r(N)'
            count if triplet==`num' & census==1 
            di "N for triplet `num' if census==1, 19`d'0, is `r(N)'" 

            if `r(N)'< `num_in_survey' { 
                di "NOT ENOUGH N for `num' in 19`d'0!!!" 

                *Generate expansion factor 
                local factor=1+ceil(`num_in_survey'/`r(N)') 
                expand `factor' if triplet==`num' & census==1, gen(exp_triplet_`num')
            } 
        } 

        *Save a tempfile w/ expanded census observations as needed 
        tempfile simul_19`d'0 
        count 
        local count_2=`r(N)'
        di "orig N: `count_1'" 
        di "new N: `count_2'" 
        save `simul_19`d'0' 
    } 

**------------------------**
**------------------------**

* Next step: assign income via brute force 
        
    forval d=1/7 { 

        noisily display "simulation for 19`d'0"
        noisily display c(current_time)

        use `simul_19`d'0', clear 
        tab census 

    /* Before imputations, grab variance and residual variance 
       from census by decade */
        quietly sum log_father_hh_income if census==1 [aw=weight], d
        local tot_var_`d'0=`r(Var)'
        quietly reg log_father_hh_income i.triplet if census==1 [aw=weight] 
        local resid_var_`d'0=(`e(rmse)')^2 
        local r_sq_`d'0=`e(r2)'  
                
    /* For each person in our survey: replace their average income score 
       with a random draw from the same cell within the census  */

        levelsof triplet, local(triplet_list) 
        foreach num in `triplet_list'  { 
            di "Doing triplet `num'" 
            
            * Bring in  original dataset 
            use `simul_19`d'0', clear 
            
            * Keep observations with a given triplet & Census values 
            tab census if triplet==`num' 
            
            * Count how many survey observations we have for that triplet 
            di "N for triplet `num' if census==0, 19`d'0" 
            count if triplet==`num' & census==0 
            local num_in_survey=`r(N)' //save this
            di "N for triplet `num' if census==1 19`d'0" 
            count if triplet==`num' & census==1 
            
            /* See if the N for each Census triplet is big enough
               (i.e., at least as much as the survey N for that 
                triplet) */
            if `r(N)'< `num_in_survey' {
                di "NOT ENOUGH N for `num' in 19`d'0!!!" 
                stop 
            }
            else {
                di "enough observations" 
            }
                
        * Next: Permute the census data ordering 
            keep if triplet==`num' & census==1
            tempfile clust_data_`num' 
            save `clust_data_`num''
            
            * Re-order the census values `reps' # of times 
            forval x=1/`reps' { 
                use `clust_data_`num'', clear 
                
                gen random`x'=runiform()
                sort random`x' 
                
                gen inc_order=log_father_hh_income 
                keep if _n<=`num_in_survey' 
                gen within_clust_id=_n 
                mkmat triplet within_clust_id inc_order, matrix(sim_trip`num'_rep`x')
            }
        
        }

    /* Generate the simulated income draws for each survey observation 
       Note: Only need survey data  */ 
        use `simul_19`d'0', clear 

        keep if census==0 
        di "How many observations are we starting with?" 
        count 
        
        * Fix the ordering of the survey data 
        gen random=runiform()
        sort triplet random 
        bys triplet: gen ordering=_n 

        forval x=1/`reps' { 
            gen inc_sim_`x'=.
            
            foreach num in `triplet_list' { 
                count if triplet==`num' 
                local num_in_survey=`r(N)'
                
                forval n=1/`num_in_survey' { 
                    quietly replace inc_sim_`x'=sim_trip`num'_rep`x'[`n',3] if triplet==`num' & ordering==`n' 
                } 
            } 
        } 

    * Calculate IGE and save betas
        display "IGE time"
        
        gen beta=. 
        gen sample_size=. 

        forval x=1/`reps' {
            quietly reg log_son_baseline inc_sim_`x' [aw=weight], robust

                quietly replace beta=_b[inc_sim_`x'] if _n==`x' 
                quietly count if e(sample)==1 
                quietly replace sample_size=`r(N)' if _n==`x'   
        } 

        sum beta, d 
        local beta_`d'0=`r(mean)'
        
        sum sample_size 
        local samp_size_`d'0=`r(mean)'

    * Save output
        gen iter = _n
        keep if iter<=`reps'
        
        keep iter beta sample_size
        rename beta beta_`d' 
        rename sample_size sample_size_`d'
        
        tempfile output_`d'
        save `output_`d''
    
    } 
    
* Display output
    noisily di "IGE coefficients by decade" 
    noisily di "1910:" `beta_10'
    noisily di "1920:" `beta_20'
    noisily di "1930:" `beta_30'
    noisily di "1940:" `beta_40'
    noisily di "1950:" `beta_50'
    noisily di "1960:" `beta_60'
    noisily di "1970:" `beta_70'

    noisily di "sample size by decade" 
    forval i=1(1)7 {
        noisily di "19`i'0:" `samp_size_`i'0'
    }

    noisily di "Var, resid var, r2 by decade" 
    forval d=1/7 { 
        noisily di "19`d'0.    Tot var: `tot_var_`d'0'.  Resid var: `resid_var_`d'0'.  Rsq:   `r_sq_`d'0'"
    } 
    
* Append output and save
    noisily display "merge output"

    use `output_1', clear
    forval i=2(1)7 {
        merge 1:1 iter using `output_`i'', nogen
    }
    
    save "$Mydirectory2/appendix_b/beta_IGE_output.dta", replace 